{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "from py2neo import Graph, Node\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the disease table; the CSV file is decoded as GBK.\n",
    "df = pd.read_csv('medical_data3.csv', encoding='gbk')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 实体（症状）"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 实体——所有症状"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distinct symptom names; each '症状' cell is split on commas.\n",
    "symptoms = {\n",
    "    symptom\n",
    "    for symptom_field in df['症状']\n",
    "    for symptom in symptom_field.split(',')\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 实体——疾病种类（上位）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distinct disease categories; each '疾病种类' cell is split on commas.\n",
    "types = {\n",
    "    category\n",
    "    for category_field in df['疾病种类']\n",
    "    for category in category_field.split(',')\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 实体——药治"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distinct drug names from the comma-separated '药治' column.\n",
    "# Cells that are not strings (e.g. the NaN pandas uses for an empty cell) are\n",
    "# skipped explicitly instead of being hidden behind a bare `except`, which would\n",
    "# also swallow unrelated errors such as KeyboardInterrupt.\n",
    "drugs = set()\n",
    "for each in df['药治']:\n",
    "    if isinstance(each, str):\n",
    "        drugs.update(each.split(','))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 疾病字典信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Disease records: one plain dict per DataFrame row (column name -> cell value).\n",
    "disease_infos = [dict(record) for _, record in df.iterrows()]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 实体对关系（边）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def deduplicate(rels_old):\n",
    "    '''Return the relations of `rels_old` with duplicates removed, keeping first-seen order.\n",
    "\n",
    "    Args:\n",
    "        rels_old: iterable of relations, each a sequence of hashable values\n",
    "            (in this notebook: [head, tail] lists of strings).\n",
    "\n",
    "    Returns:\n",
    "        A new list holding the first occurrence of every distinct relation.\n",
    "\n",
    "    A set of already-seen relations replaces a membership scan over the result\n",
    "    list, which is quadratic in the number of relations.\n",
    "    '''\n",
    "    seen = set()\n",
    "    rels_new = []\n",
    "    for each in rels_old:\n",
    "        # Lists are unhashable, so a tuple with the same items serves as the set key.\n",
    "        key = tuple(each)\n",
    "        if key not in seen:\n",
    "            seen.add(key)\n",
    "            rels_new.append(each)\n",
    "    return rels_new"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 关系：疾病——病因"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build [disease name, cause] pairs from the comma-separated '病因' column.\n",
    "rels_reason = []\n",
    "for idx, row in df.iterrows():\n",
    "    causes = row['病因']\n",
    "    # Skip non-string cells (e.g. NaN) explicitly rather than via a bare `except`,\n",
    "    # which would also hide unrelated errors.\n",
    "    if not isinstance(causes, str):\n",
    "        continue\n",
    "    for each in causes.split(','):\n",
    "        rels_reason.append([row['疾病名称'], each])\n",
    "rels_reason = deduplicate(rels_reason)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 关系：疾病——症状"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# [disease name, symptom] pairs: one per entry of the comma-separated '症状' cell.\n",
    "rels_symptom = deduplicate([\n",
    "    [record['疾病名称'], symptom]\n",
    "    for _, record in df.iterrows()\n",
    "    for symptom in record['症状'].split(',')\n",
    "])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 关系：疾病——疾病（并发症）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# [disease name, complication] pairs: one per entry of the comma-separated '并发症' cell.\n",
    "rels_acompany = deduplicate([\n",
    "    [record['疾病名称'], complication]\n",
    "    for _, record in df.iterrows()\n",
    "    for complication in record['并发症'].split(',')\n",
    "])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 关系：疾病——药物（药治）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build [disease name, drug] pairs from the comma-separated '药治' column.\n",
    "rels_drug = []\n",
    "for idx, row in df.iterrows():\n",
    "    drug_field = row['药治']\n",
    "    # Skip non-string cells (e.g. NaN) explicitly rather than via a bare `except`,\n",
    "    # which would also hide unrelated errors.\n",
    "    if not isinstance(drug_field, str):\n",
    "        continue\n",
    "    for each in drug_field.split(','):\n",
    "        rels_drug.append([row['疾病名称'], each])\n",
    "# Deduplicate `rels_drug` itself so that code reading it gets unique pairs;\n",
    "# `rels_recommanddrug` stays available as an alias for existing references.\n",
    "rels_drug = deduplicate(rels_drug)\n",
    "rels_recommanddrug = rels_drug"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 关系：疾病——疾病种类（上下位）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "rels_category = []    # [disease, category] pairs\n",
    "rels_department = []  # [narrower category, broader category] pairs (e.g. respiratory medicine under internal medicine)\n",
    "for idx, row in df.iterrows():\n",
    "    # The category path comes from '疾病种类'. Splitting '疾病名称' here instead\n",
    "    # raises IndexError when the second element is read from a comma-free name.\n",
    "    parts = row['疾病种类'].split(',')\n",
    "    if len(parts) == 1:\n",
    "        rels_category.append([row['疾病名称'], parts[0]])\n",
    "    else:\n",
    "        big, small = parts[0], parts[1]  # broader category first, narrower second\n",
    "        rels_category.append([row['疾病名称'], small])\n",
    "        rels_department.append([small, big])\n",
    "rels_category = deduplicate(rels_category)\n",
    "rels_department = deduplicate(rels_department)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 连接知识图谱"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Connection settings are read from the environment so credentials need not live\n",
    "# in the notebook; the fallbacks are the local defaults used so far.\n",
    "g = Graph(os.environ.get('NEO4J_URI', 'http://localhost:7474'),\n",
    "          auth=(os.environ.get('NEO4J_USER', 'neo4j'),\n",
    "                os.environ.get('NEO4J_PASSWORD', '123456')))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 创建实体（节点）"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 创建疾病实体"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create one Disease node per record. This stays best effort: a failing record is\n",
    "# reported and skipped instead of being silently swallowed by a bare `except`.\n",
    "count = 0\n",
    "for disease_dict in disease_infos:\n",
    "    try:\n",
    "        node = Node(\"Disease\",\n",
    "                    name=disease_dict['疾病名称'],\n",
    "                    type=disease_dict['疾病种类'],\n",
    "                    cause=disease_dict['病因'])\n",
    "        g.create(node)\n",
    "    except Exception as e:\n",
    "        print('创建疾病实体失败：', disease_dict.get('疾病名称'), e)\n",
    "        continue\n",
    "    count += 1\n",
    "    print('创建疾病实体：', disease_dict['疾病名称'])\n",
    "print('共创建 {} 个疾病实体'.format(count))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 创建药物实体"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One 'Drug' node per distinct drug name.\n",
    "for drug_name in drugs:\n",
    "    g.create(Node('Drug', name=drug_name))\n",
    "    print('创建实体 {}'.format(drug_name))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 创建疾病种类实体"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One 'Type' node per distinct disease category.\n",
    "for type_name in types:\n",
    "    g.create(Node('Type', name=type_name))\n",
    "    print('创建实体 {}'.format(type_name))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 创建症状实体"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One 'Symptom' node per distinct symptom name.\n",
    "for symptom_name in symptoms:\n",
    "    g.create(Node('Symptom', name=symptom_name))\n",
    "    print('创建实体 {}'.format(symptom_name))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 创建知识图谱的实体对关系"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_relationship(start_node, end_node, edges, rel_type, rel_name):\n",
    "    '''Create one relationship per edge between existing nodes matched by name.\n",
    "\n",
    "    Args:\n",
    "        start_node: label of the source nodes.\n",
    "        end_node: label of the target nodes.\n",
    "        edges: iterable of [source name, target name] pairs.\n",
    "        rel_type: relationship type used in the Cypher pattern.\n",
    "        rel_name: value stored in the `name` property of each relationship.\n",
    "\n",
    "    Node names and `rel_name` are sent as Cypher parameters, so values containing\n",
    "    quotes cannot break or alter the statement. Labels and the relationship type\n",
    "    cannot be parameterised in Cypher and are still interpolated; pass trusted\n",
    "    identifiers only. Errors raised while running a statement are printed and the\n",
    "    loop continues with the next edge.\n",
    "    '''\n",
    "    # Only identifiers are formatted in, so the statement is the same for every edge.\n",
    "    query = \"match(p:%s),(q:%s) where p.name=$p_name and q.name=$q_name create (p)-[rel:%s{name:$rel_name}]->(q)\" % (start_node, end_node, rel_type)\n",
    "    for edge in edges:\n",
    "        p = edge[0]\n",
    "        q = edge[1]\n",
    "        try:\n",
    "            g.run(query, {'p_name': p, 'q_name': q, 'rel_name': rel_name})\n",
    "            print('创建关系 {}-{}->{}'.format(p, rel_type, q))\n",
    "        except Exception as e:\n",
    "            print(e)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 创建所有关系"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): no cell in this notebook creates 'Reason' nodes, so this call matches\n",
    "# nothing as written - confirm whether Reason nodes should be created first.\n",
    "create_relationship('Disease', 'Reason', rels_reason, 'reason', '病因')\n",
    "# Use the deduplicated pairs so each disease-drug relationship is created once.\n",
    "create_relationship('Disease', 'Drug', rels_recommanddrug, 'drug', '药治')\n",
    "create_relationship('Disease', 'Symptom', rels_symptom, 'has_symptom', '症状')\n",
    "create_relationship('Disease', 'Disease', rels_acompany, 'acompany_with', '并发症')\n",
    "# Category nodes are created with the label 'Type', so match on that label;\n",
    "# matching on 'Department' finds no nodes and creates no relationships.\n",
    "create_relationship('Disease', 'Type', rels_category, 'belongs_to', '疾病种类')\n",
    "# Narrower category -> broader category\n",
    "create_relationship('Type', 'Type', rels_department, 'belongs_to', '属于')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 删除所有节点"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Destructive: this Cypher statement matches every node and deletes it together\n",
    "# with its relationships.\n",
    "# NOTE(review): the cell is unguarded, so Run All executes it right after the graph\n",
    "# has been built - confirm whether it should sit behind an explicit flag.\n",
    "cypher = 'MATCH (n) DETACH DELETE n'\n",
    "g.run(cypher)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
